nyc_weather <- tibble(read.csv("data/NYC_weather_1869_2022.csv")) %>%
dplyr::select(TMAX, TMIN, DATE, PRCP, SNOW)
# view structure
str(nyc_weather)
## tibble [56,093 × 5] (S3: tbl_df/tbl/data.frame)
## $ TMAX: int [1:56093] 25 26 36 43 44 35 36 26 38 46 ...
## $ TMIN: int [1:56093] 17 4 19 28 19 11 20 20 26 32 ...
## $ DATE: chr [1:56093] "1869-02-28" "1869-03-01" "1869-03-02" "1869-03-03" ...
## $ PRCP: num [1:56093] 0 0 0.02 0 0.1 0 0.03 0 0 0 ...
## $ SNOW: num [1:56093] 0 0 0 0 0 0 0.8 0 0 0 ...
# get summary
xkablesummary(nyc_weather)
| TMAX | TMIN | DATE | PRCP | SNOW | |
|---|---|---|---|---|---|
| Min | Min. : 2.0 | Min. :-15.0 | Length:56093 | Min. :0.00 | Min. : 0.0 |
| Q1 | 1st Qu.: 46.0 | 1st Qu.: 34.0 | Class :character | 1st Qu.:0.00 | 1st Qu.: 0.0 |
| Median | Median : 63.0 | Median : 47.0 | Mode :character | Median :0.00 | Median : 0.0 |
| Mean | Mean : 61.5 | Mean : 46.8 | NA | Mean :0.12 | Mean : 0.1 |
| Q3 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | NA | 3rd Qu.:0.05 | 3rd Qu.: 0.0 |
| Max | Max. :106.0 | Max. : 87.0 | NA | Max. :8.28 | Max. :27.3 |
| NA | NA’s :7 | NA’s :7 | NA | NA | NA’s :163 |
Convert DATE column from char to Date
type
nyc_weather <- nyc_weather %>%
mutate(DATE = as.Date(DATE, format="%Y-%m-%d"))
# review dataset structure
xkablesummary(nyc_weather)
| TMAX | TMIN | DATE | PRCP | SNOW | |
|---|---|---|---|---|---|
| Min | Min. : 2.0 | Min. :-15.0 | Min. :1869-02-28 | Min. :0.00 | Min. : 0.0 |
| Q1 | 1st Qu.: 46.0 | 1st Qu.: 34.0 | 1st Qu.:1907-07-23 | 1st Qu.:0.00 | 1st Qu.: 0.0 |
| Median | Median : 63.0 | Median : 47.0 | Median :1945-12-13 | Median :0.00 | Median : 0.0 |
| Mean | Mean : 61.5 | Mean : 46.8 | Mean :1945-12-13 | Mean :0.12 | Mean : 0.1 |
| Q3 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | 3rd Qu.:1984-05-05 | 3rd Qu.:0.05 | 3rd Qu.: 0.0 |
| Max | Max. :106.0 | Max. : 87.0 | Max. :2022-09-26 | Max. :8.28 | Max. :27.3 |
| NA | NA’s :7 | NA’s :7 | NA | NA | NA’s :163 |
Add a MONTH column to the dataset. The month is extracted from the
DATE.
Subset data for DATE > 1900
nyc_weather <- nyc_weather %>%
filter(lubridate::year(DATE) >= 1900) %>%
mutate(MONTH = lubridate::month(DATE, label=T))
str(nyc_weather)
## tibble [44,829 × 6] (S3: tbl_df/tbl/data.frame)
## $ TMAX : int [1:44829] 20 23 26 32 39 40 43 40 33 41 ...
## $ TMIN : int [1:44829] 14 13 19 19 29 32 31 20 15 30 ...
## $ DATE : Date[1:44829], format: "1900-01-01" "1900-01-02" ...
## $ PRCP : num [1:44829] 0.03 0 0 0 0 0 0 0.01 0 0 ...
## $ SNOW : num [1:44829] 0.5 0 0 0 0 0 0 0 0 0 ...
## $ MONTH: Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
# review dataset structure
xkablesummary(nyc_weather)
| TMAX | TMIN | DATE | PRCP | SNOW | MONTH | |
|---|---|---|---|---|---|---|
| Min | Min. : 2.0 | Min. :-15.0 | Min. :1900-01-01 | Min. :0.00 | Min. : 0.0 | Jan : 3813 |
| Q1 | 1st Qu.: 47.0 | 1st Qu.: 34.0 | 1st Qu.:1930-09-08 | 1st Qu.:0.00 | 1st Qu.: 0.0 | Mar : 3813 |
| Median | Median : 63.0 | Median : 48.0 | Median :1961-05-15 | Median :0.00 | Median : 0.0 | May : 3813 |
| Mean | Mean : 62.1 | Mean : 47.2 | Mean :1961-05-15 | Mean :0.13 | Mean : 0.1 | Jul : 3813 |
| Q3 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | 3rd Qu.:1992-01-20 | 3rd Qu.:0.05 | 3rd Qu.: 0.0 | Aug : 3813 |
| Max | Max. :106.0 | Max. : 87.0 | Max. :2022-09-26 | Max. :7.57 | Max. :27.3 | Oct : 3782 |
| NA | NA | NA | NA | NA | NA’s :96 | (Other):21982 |
mean_tmax <- mean(nyc_weather$TMAX)
mean_tmin <- mean(nyc_weather$TMIN)
ggplot(nyc_weather) +
geom_histogram(aes(x=TMAX, fill="Max Temp."), na.rm=TRUE, alpha=0.5, color="black", bins=100, binwidth=2) +
geom_vline(xintercept=mean_tmax, color="red", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_tmax + 9, y=2000, label=paste(round(mean_tmax, 2), "°F" ), angle=0, size=4, color="darkred") +
geom_histogram(aes(x=TMIN, fill="Min Temp."), na.rm=TRUE, alpha=0.5, color="black", bins=100, binwidth=2) +
geom_vline(xintercept=mean_tmin, color="blue", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_tmin - 9, y=2000, label=paste(round(mean_tmin, 2), "°F" ), angle=0, size=4, color="darkblue") +
labs(title="Distribution of Minimum and Maximum Daily Temperatures", x="Temperature (°F)", y="Count") +
scale_y_continuous(position = "right") +
scale_fill_manual(name="Column", values=c("Max Temp."="red","Min Temp."="blue"))
nyc_weather %>%
ggplot(aes(y=TMAX)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black", fill="red") +
labs(title="TMAX Boxplot", y="Temperature (°F)")
nyc_weather %>%
ggplot(aes(y=TMIN)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black", fill="steelblue") +
labs(title="TMIN Boxplot", x="", y="Temperature (°F)")
qqnorm(nyc_weather$TMAX,
main="QQ Plot TMAX",
xlab="Theoretical Quantile",
ylab="Temperature (°F)",
col="red")
qqline(nyc_weather$TMAX, lwd=1, lty=2)
qqnorm(nyc_weather$TMIN,
main="QQ Plot TMIN",
xlab="Theoretical Quantile",
ylab="Temperature (°F)",
col="blue")
qqline(nyc_weather$TMIN, lwd=1, lty=2)
# group data by month
# and calculate average
nyc_weather_by_month <- nyc_weather %>%
group_by(MONTH) %>%
summarize("Avg Max Temp." = mean(TMAX, na.rm=T),
"Avg Min Temp." = mean(TMIN, na.rm=T),
avg_prcp = mean(PRCP, na.rm=T),
avg_snow = mean(SNOW, na.rm=T))
# side-by-side bar plot
nyc_weather_by_month %>%
dplyr::select(MONTH, "Avg Max Temp.", "Avg Min Temp.") %>%
gather(key="Value", value="Temp.", "Avg Max Temp.", "Avg Min Temp.") %>%
ggplot(aes(x=MONTH, y=Temp., fill=Value)) +
geom_col(na.rm=TRUE, alpha=0.5, color="black", position="dodge") +
labs(title="Average Daily Temperature By Month", x="Month", y="Average Temperature (°F)") +
scale_fill_manual(values=c("blue", "red"))
nyc_weather_by_month %>%
dplyr::select(MONTH, avg_prcp, avg_snow) %>%
gather(key = "data_point", value="value", avg_prcp, avg_snow) %>%
ggplot(aes(x=MONTH, y=value, fill=data_point)) +
geom_col(na.rm=TRUE, alpha=0.7, color="black", position="dodge") +
labs(x="Month", y="Amount (inches)")
For this analysis, I sought to subset the data into 30-50 year
periods as such:
* 1900-1940: Industrial Revolution
* 1940-1980: Cold War Era
* 1980-present: Modern Era
# filter weather data for years 1900-1940
industrial_era <- nyc_weather %>%
dplyr::select(-MONTH) %>%
filter(lubridate::year(DATE) >= 1900, lubridate::year(DATE) < 1940)
str(industrial_era)
## tibble [14,609 × 5] (S3: tbl_df/tbl/data.frame)
## $ TMAX: int [1:14609] 20 23 26 32 39 40 43 40 33 41 ...
## $ TMIN: int [1:14609] 14 13 19 19 29 32 31 20 15 30 ...
## $ DATE: Date[1:14609], format: "1900-01-01" "1900-01-02" ...
## $ PRCP: num [1:14609] 0.03 0 0 0 0 0 0 0.01 0 0 ...
## $ SNOW: num [1:14609] 0.5 0 0 0 0 0 0 0 0 0 ...
industrial_era %>%
ggplot() +
geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
labs(title="Distribution of TMAX for Years 1900-1940", x="Temperature (°F)", y="Count")
industrial_era %>%
dplyr::select(TMAX, TMIN, DATE) %>%
gather(key="data_point", value="value", TMAX, TMIN) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of TMAX and TMIN for Years 1900-1940", x="Data Point", y="Temperature (°F)")
industrial_era %>%
dplyr::select(PRCP, SNOW, DATE) %>%
gather(key="data_point", value="value", PRCP, SNOW) %>%
filter(value > 0.0) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of PRCP and SNOW for Years 1900-1940", x="Data Point", y="Amount (inches)")
coldwar_era <- nyc_weather %>%
dplyr::select(-MONTH) %>%
filter(lubridate::year(DATE) >= 1940, lubridate::year(DATE) < 1980)
str(coldwar_era)
## tibble [14,610 × 5] (S3: tbl_df/tbl/data.frame)
## $ TMAX: int [1:14610] 24 28 33 32 29 30 26 24 29 34 ...
## $ TMIN: int [1:14610] 15 17 17 20 16 14 12 19 20 18 ...
## $ DATE: Date[1:14610], format: "1940-01-01" "1940-01-02" ...
## $ PRCP: num [1:14610] 0 0 0 0 0.03 0 0 0.08 0 0 ...
## $ SNOW: num [1:14610] 0 0 0 0 0.3 0 0 1 0 0 ...
coldwar_era %>%
ggplot() +
geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
labs(title="Distribution of TMAX for Years 1940-1980", x="Temperature (°F)", y="Count")
coldwar_era %>%
dplyr::select(TMAX, TMIN, DATE) %>%
gather(key="data_point", value="value", TMAX, TMIN) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of TMAX and TMIN for Years 1940-1980", x="Data Point", y="Temperature (°F)")
coldwar_era %>%
dplyr::select(PRCP, SNOW, DATE) %>%
gather(key="data_point", value="value", PRCP, SNOW) %>%
filter(value > 0.0) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of PRCP and SNOW for Years 1940-1980", x="Data Point", y="Amount (inches)")
modern_era <- nyc_weather %>%
dplyr::select(-MONTH) %>%
filter(lubridate::year(DATE) >= 1980)
str(modern_era)
## tibble [15,610 × 5] (S3: tbl_df/tbl/data.frame)
## $ TMAX: int [1:15610] 45 42 38 30 32 32 40 38 32 34 ...
## $ TMIN: int [1:15610] 34 34 28 21 26 19 27 31 29 23 ...
## $ DATE: Date[1:15610], format: "1980-01-01" "1980-01-02" ...
## $ PRCP: num [1:15610] 0 0 0 0 0.07 0 0 0 0 0 ...
## $ SNOW: num [1:15610] 0 0 0 0 2 0 0 0 0 0 ...
modern_era %>%
ggplot() +
geom_histogram(aes(x=TMAX, fill="TMAX"), na.rm=TRUE, alpha=0.8, fill="steelblue", bins=100, binwidth=2) +
labs(title="Distribution of TMAX for Years 1980-Present", x="Temperature (°F)", y="Count")
modern_era %>%
dplyr::select(TMAX, TMIN, DATE) %>%
gather(key="data_point", value="value", TMAX, TMIN) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of TMAX and TMIN for Years 1980-Present", x="Data Point", y="Temperature (°F)")
modern_era %>%
dplyr::select(PRCP, SNOW, DATE) %>%
gather(key="data_point", value="value", PRCP, SNOW) %>%
filter(value > 0.0) %>%
ggplot(aes(y=value, fill=data_point)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
labs(title="Distribution of PRCP and SNOW for Years 1980-Present", x="Data Point", y="Amount (inches)")
\(H_0:\) The average
TMAX across the three eras is equal.
\(H_1:\) The average TMAX
across the three eras is different.
\(\alpha\): 0.05
# modify data set by adding ERA column
industrial_era <- industrial_era %>%
mutate(Era = 1)
coldwar_era <- coldwar_era %>%
mutate(Era = 2)
modern_era <- modern_era %>%
mutate(Era = 3)
# combine data frames
all_eras <- rbind(industrial_era, coldwar_era, modern_era)
# convert ERA column to factor/categorical
all_eras$Era <- factor(all_eras$Era, labels = c("Indst.", "Cold War", "Modern"))
# box plot
means <- aggregate(TMAX ~ Era, all_eras, mean)
all_eras %>%
ggplot(aes(x=Era, y=TMAX, fill=Era)) +
geom_boxplot(na.rm=TRUE, alpha=0.7, color="black") +
stat_summary_bin(fun="mean", geom="point", shape=20, size=5, show.legend=FALSE) +
geom_text(data = means, aes(label=round(TMAX, 2) , y=TMAX - 5)) +
labs(title="Distribution of Daily Maximum Temperature By Era", x="Era", y="Temperature (°F)") +
scale_fill_brewer(palette="Dark2")
# run ANOVA
anova_res <- aov(TMAX ~ Era, data = all_eras)
summary(anova_res)
## Df Sum Sq Mean Sq F value Pr(>F)
## Era 2 35301 17651 51 <2e-16 ***
## Residuals 44826 15516075 346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
xkabledply(anova_res)
| Df | Sum Sq | Mean Sq | F value | Pr(>F) | |
|---|---|---|---|---|---|
| Era | 2 | 35301 | 17651 | 51 | 0 |
| Residuals | 44826 | 15516075 | 346 | NA | NA |
With a p-value of 7.519^{-23}, we strongly reject the \(H_0\) that the average daily maximum temperature between the 3 eras is the same.
This tests will analyze the difference between means of the 3 eras:
tukeyAoV <- TukeyHSD(anova_res, conf.level=0.95)
tukeyAoV
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = TMAX ~ Era, data = all_eras)
##
## $Era
## diff lwr upr p adj
## Cold War-Indst. 1.339 0.829 1.85 0
## Modern-Indst. 2.147 1.645 2.65 0
## Modern-Cold War 0.808 0.306 1.31 0
plot(tukeyAoV, col="red", cex.axis=0.75)
The Tukey tests shows that the greatest difference in average daily maximum temperature is between the Industrial and Modern eras.
# extract PRCP outlier rows into df variable
# remove zeros
# group variable by decade, getting sum for each decade
prcp_no_zeros <- nyc_weather %>%
filter(PRCP > 0.0 & !is.na(PRCP))
prcp_boxplot <- boxplot(prcp_no_zeros$PRCP, ylab = "Temperature (°F)")
prcp_outliers <- boxplot.stats(prcp_no_zeros$PRCP)$out
prcp_outliers.index <- which(prcp_no_zeros$PRCP %in% c(prcp_outliers))
prcp_outliers_df <- prcp_no_zeros[prcp_outliers.index, ]
prcp_outliers_df.yearly <- prcp_outliers_df %>%
mutate(YEAR = lubridate::year(DATE)) %>%
group_by(YEAR) %>%
summarize(prcp_total = sum(PRCP, na.rm=T), count = n())
# show top 5 years
top_5_years.prcp_total <- prcp_outliers_df.yearly %>%
slice_max(order_by = prcp_total, n = 5)
prcp_outliers_df.yearly %>%
ggplot(aes(x=YEAR, y=prcp_total)) +
geom_line(na.rm=TRUE, alpha=0.7, color="darkgreen") +
geom_point(na.rm = TRUE, fill="#69b3a2", shape = 21) +
geom_text_repel(aes(label=YEAR), data=top_5_years.prcp_total) +
labs(x="Year", y="Total Yearly Amount (inches)") +
scale_fill_brewer(palette="Spectral")
top_5_years.count <- prcp_outliers_df.yearly %>%
slice_max(order_by = count, n = 5)
prcp_outliers_df.yearly %>%
ggplot(aes(x=YEAR, y=count)) +
geom_line(na.rm=TRUE, alpha=0.7, color="darkgreen") +
geom_point(na.rm = TRUE, fill="#69b3a2", shape = 21) +
geom_text_repel(aes(label=YEAR), data=top_5_years.count) +
labs(title="Precipitation Outlier Days By Year", x="Year", y="Days with Outliers")
# group into decades and create bar plot
prcp_outliers_df.decades <- prcp_outliers_df %>%
mutate(decade = paste(floor(lubridate::year(DATE) / 10) * 10, "s", sep="")) %>%
group_by(decade) %>%
summarise(prcp_total = sum(PRCP, na.rm=T), count = n())
prcp_outliers_df.decades %>%
ggplot(aes(group=1)) +
geom_col(aes(x=decade, y=prcp_total), na.rm=TRUE, alpha=0.8, color="black", fill="darkgreen", position="identity") +
geom_line(aes(x=decade, y=count), stat="identity", color="#69b3a2", size=1) +
labs(title="Precipitation Outlier Days And Total Precipitation By Decade", x="Decade", y="Total Amount (inches)") +
scale_y_continuous(sec.axis=sec_axis(~., name="Days with Outliers"))
prcp_outliers_df.decades %>%
ggplot(aes(x=decade, y=count)) +
geom_col(na.rm=TRUE, alpha=0.7, color="black", fill="#56B4E9", position="identity") +
labs(x="Decade", y="Days with Outliers")
# sum snowfall from 24-25th Dec
nyc_weather.christmas_snow <- nyc_weather %>%
filter(lubridate::month(DATE) == 12, (lubridate::day(DATE) == 24 | lubridate::day(DATE) == 25)) %>%
mutate(YEAR = lubridate::year(DATE)) %>%
group_by(YEAR) %>%
summarise(snowfall = sum(SNOW, na.rm = T))
# how many years has there been snow?
with_snow <- nyc_weather.christmas_snow %>%
filter(snowfall > 0.0)
# highest snowfall year?
largest_snowfall <- nyc_weather.christmas_snow[which.max(nyc_weather.christmas_snow$snowfall), ]
# yearly plot for years it snowed
with_snow %>%
ggplot(aes(x=YEAR, y=snowfall)) +
geom_line(na.rm=TRUE, alpha=0.7, color="steelblue", size=1.25) +
geom_point(na.rm = TRUE, fill="#00edfa", shape = 21) +
geom_label_repel(aes(label=YEAR), data=with_snow) +
labs(title="Years With Snowfall on Christmas Eve/Christmas Day", x="Year", y="Snowfall (inches)")
It has snowed on 24 out of 152 Christmases.
The largest snowfall recorded on Christmas/Christmas Eve was in 1912 when it snowed 11.4 inches!!
weather_data <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))
weather_data$DATE <- as.Date(weather_data$DATE)
weather_data$YEAR <- as.numeric(format(weather_data$DATE, "%Y"))
weather_data$MONTH<- as.factor(format(weather_data$DATE, "%m"))
weather_data$DAY <- format(weather_data$DATE, "%d")
I broke out the date column into separate Day, Month, and Year columns. I did this to make the data easier to manipulate on a month or year level.
weather_100yr <- subset(weather_data, weather_data$YEAR > 1899 & weather_data$YEAR < 2022)
month_tmax_avg <- aggregate(TMAX ~ MONTH + YEAR, weather_data, mean)
month_tmin_avg <- aggregate(TMIN ~ MONTH + YEAR, weather_data, mean)
month_precip_avg <- aggregate(PRCP ~ MONTH + YEAR, weather_data, mean)
colnames(month_precip_avg)[3] <- "PRCP_AVG"
month_precip_total <- aggregate(PRCP ~ MONTH + YEAR, weather_data, sum)
colnames(month_precip_total)[3] <- "PRCP_TOT"
month_snow_avg <- aggregate(SNOW ~ MONTH + YEAR, weather_data, mean)
colnames(month_snow_avg)[3] <- "SNOW_AVG"
month_snow_total <- aggregate(SNOW ~ MONTH + YEAR, weather_data, sum)
colnames(month_snow_total)[3] <- "SNOT_TOT"
monthly_weather <- data.frame(month_tmax_avg,
month_tmin_avg$TMIN,
month_precip_avg$PRCP_AVG,
month_precip_total$PRCP_TOT,
month_snow_avg$SNOW_AVG,
month_snow_total$SNOT_TOT)
colnames(monthly_weather)[4:8] <- c("TMIN", "PRCP_AVG", "PRCP_TOT", "SNOW_AVG", "SNOW_TOT")
monthly_weather_100yr <- subset(monthly_weather, monthly_weather$YEAR > 1899 & monthly_weather$YEAR < 2022)
Found the monthly average for TMAX, TMIN, PRCP, and SNOW for all months from 1900-2022. Additionally found the total monthly PRCP and SNOW for all months from 1900 - 2022.
year_tmax_avg <- aggregate(TMAX ~ YEAR, weather_data, mean)
year_tmin_avg <- aggregate(TMIN ~ YEAR, weather_data, mean)
year_precip_avg <- aggregate(PRCP ~ YEAR, weather_data, mean)
colnames(year_precip_avg)[2] <- "PRCP_AVG"
year_precip_total <- aggregate(PRCP ~ YEAR, weather_data, sum)
colnames(year_precip_total)[2] <- "PRCP_TOT"
year_snow_avg <- aggregate(SNOW ~ YEAR, weather_data, mean)
colnames(year_snow_avg)[2] <- "SNOW_AVG"
year_snow_total <- aggregate(SNOW ~ YEAR, weather_data, sum)
colnames(year_snow_total)[2] <- "SNOT_TOT"
yearly_weather <- data.frame(year_tmax_avg,
year_tmin_avg$TMIN,
year_precip_avg$PRCP_AVG,
year_precip_total$PRCP_TOT,
year_snow_avg$SNOW_AVG,
year_snow_total$SNOT_TOT)
colnames(yearly_weather)[3:7] <- c("TMIN", "PRCP_AVG", "PRCP_TOT", "SNOW_AVG", "SNOW_TOT")
yearly_weather_100yr <- subset(yearly_weather, yearly_weather$YEAR > 1899 & yearly_weather$YEAR < 2022)
melt_year_data <- melt(yearly_weather_100yr, id.vars = "YEAR")
Found yearly averages for TMAX, TMIN, PRCP, and SNOW. Additionally found the yearly totals for PRCP and SNOW.
yearly_tmax_scatter <- ggplot(yearly_weather_100yr, aes(x = YEAR, y = TMAX)) +
geom_point(color = "coral2") +
labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum Temp")
yearly_tmax_scatter
yearly_tmin_scatter <- ggplot(yearly_weather_100yr, aes(x = YEAR, y = TMIN)) +
geom_point(color = "cornflowerblue") +
labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Minimum Temp")
yearly_tmin_scatter
#Combined Tmax - Tmin plot
max_min_sub <- subset(melt_year_data, variable == "TMAX" | variable == "TMIN")
yearly_max_min_scatter <- ggplot(max_min_sub, aes(x = YEAR, y = value, col = variable)) +
geom_point() +
labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum and Minimum Daily Temperatures in Central Park, NYC, from 1900-2022")
yearly_max_min_scatter
yearly_prcp_scatter <- ggplot(yearly_weather_100yr, aes(x = YEAR, y = PRCP_TOT)) +
geom_point(color = "darkorchid") +
labs(x = "Year", y = "Total Precipitation (in)", title = "Total Yearly Precipitation")
yearly_prcp_scatter
yearly_snow_scatter <- ggplot(yearly_weather_100yr, aes(x = YEAR, y = SNOW_TOT)) +
geom_point(color = "azure4") +
labs(x = "Year", y = "Total Snowfall (in)", title = "Total Yearly Snowfall")
yearly_snow_scatter
The above code generates 5 plots. One each for TMAX, TMIN, PRCP, and SNOW. Then an additional plot that plots both TMAX and TMIN on the same graph.
grid.arrange(yearly_tmax_scatter, yearly_tmin_scatter,
yearly_prcp_scatter, yearly_snow_scatter,
nrow = 2,
top = "Weather Patterns in Central Park, NYC, from 1900-2021")
The quad chart above visualizes the four key weather patterns pulled from this data set. Top left chart: Max Daily Temps over time. There appears to be a clear trend of increasing max temperature over time. Top right chart: Min daily temperatures over time. There appears to be a similar trend of increasing min temp over time, however the change is not as drastic as max temps. Bottom left chart: Total precipitation per year. There does not appear to be any clear trend, however there appears to be more variation over the past 50 years. Bottom right chart: Total snowfall per year. There does not appear to be any clear trend. Potentially interesting to see if any discernable difference in trend from yearly precipitation.
Are there statistically measurable changes in weather patterns (e.g., temperature and precipitation levels) over time in New York City over this window?
Using a Linear Regression to estimate the linear trend and statistical significance over time.
#install.packages("gtsummary")
lm_tmax_yr <- lm(TMAX ~ YEAR, data = yearly_weather_100yr)
#lm_tmax_mtyr <- lm(TMAX ~ YEAR + MONTH, data = monthly_weather_100yr)
#lm_tmax_day <- lm(TMAX ~ YEAR, data = weather_100yr)
summary(lm_tmax_yr)
##
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.925 -0.827 0.076 0.746 3.190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.5663 6.2657 2.01 0.047 *
## YEAR 0.0253 0.0032 7.90 1.5e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 120 degrees of freedom
## Multiple R-squared: 0.342, Adjusted R-squared: 0.337
## F-statistic: 62.4 on 1 and 120 DF, p-value: 1.49e-12
#summary(lm_tmax_mtyr)
#summary(lm_tmax_day)
coef_tmax_yr <- c(coefficients(lm_tmax_yr), summary(lm_tmax_yr)$adj.r.squared)
coef_tmax_yr
## (Intercept) YEAR
## 12.5664 0.0252 0.3367
The table above shows the slope and intercept of the TMAX linear
model based on two different approaches.coef_tmax_day uses
the raw daily TMAX for each year in the dataset while
coef_tmax_yr uses the yearly tmax averages.
yr_tmax_scatter_line <- yearly_tmax_scatter+
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F)
yr_tmax_scatter_line
The regression line added to the yearly TMAX scatterplot. The slope of the line is 0.025. This implies the average daily maximum temperature is increasing by 0.025 degrees F per year, or by about 0.252 degrees F per decade.
lm_tmin_yr <- lm(TMIN ~ YEAR, data = yearly_weather_100yr)
summary(lm_tmin_yr)
##
## Call:
## lm(formula = TMIN ~ YEAR, data = yearly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7039 -0.9296 0.0013 0.9041 3.1525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.1490 6.0830 -0.52 0.61
## YEAR 0.0256 0.0031 8.27 2.1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.21 on 120 degrees of freedom
## Multiple R-squared: 0.363, Adjusted R-squared: 0.358
## F-statistic: 68.3 on 1 and 120 DF, p-value: 2.13e-13
#lm_tmin_day <- lm(TMIN ~ YEAR, data = weather_100yr)
#summary(lm_tmin_day)
#coef_tmin_yr <- coefficients(lm_tmin_yr)
#coef_tmin_day <- coefficients(lm_tmin_day)
The table above shows the slope and intercept of the TMIN linear
model based on two different approaches.coef_tmin_day uses
the raw daily TMIN for each year in the dataset while
coef_tmin_yr uses the yearly TMIN averages.
yr_tmin_scatter_line <- yearly_tmin_scatter +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F)
yr_tmin_scatter_line
lm_prcp_yr <- lm(PRCP_TOT ~ YEAR, data = yearly_weather_100yr)
lm_prcp_month <- lm(PRCP_TOT ~ YEAR + MONTH, data = monthly_weather_100yr)
lm_prcp_avg <- lm(PRCP_AVG ~ YEAR + MONTH, data = monthly_weather_100yr)
lm_prcp_day <- lm(PRCP ~ YEAR, data = weather_100yr)
summary(lm_prcp_month)
##
## Call:
## lm(formula = PRCP_TOT ~ YEAR + MONTH, data = monthly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.477 -1.469 -0.281 1.040 14.193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.79929 3.09631 -2.84 0.00455 **
## YEAR 0.00625 0.00158 3.96 7.7e-05 ***
## MONTH02 -0.16779 0.27196 -0.62 0.53736
## MONTH03 0.58582 0.27196 2.15 0.03140 *
## MONTH04 0.42787 0.27196 1.57 0.11587
## MONTH05 0.44213 0.27196 1.63 0.10422
## MONTH06 0.34311 0.27196 1.26 0.20728
## MONTH07 0.87082 0.27196 3.20 0.00139 **
## MONTH08 0.98926 0.27196 3.64 0.00028 ***
## MONTH09 0.48172 0.27196 1.77 0.07672 .
## MONTH10 0.33508 0.27196 1.23 0.21811
## MONTH11 0.07344 0.27196 0.27 0.78716
## MONTH12 0.33918 0.27196 1.25 0.21253
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.12 on 1451 degrees of freedom
## Multiple R-squared: 0.0323, Adjusted R-squared: 0.0243
## F-statistic: 4.04 on 12 and 1451 DF, p-value: 3.29e-06
summary(lm_prcp_yr)
##
## Call:
## lm(formula = PRCP_TOT ~ YEAR, data = yearly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.40 -5.59 -1.48 4.51 32.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -100.8708 42.4557 -2.38 0.01909 *
## YEAR 0.0750 0.0217 3.46 0.00074 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.42 on 120 degrees of freedom
## Multiple R-squared: 0.0909, Adjusted R-squared: 0.0833
## F-statistic: 12 on 1 and 120 DF, p-value: 0.00074
summary(lm_prcp_avg)
##
## Call:
## lm(formula = PRCP_AVG ~ YEAR + MONTH, data = monthly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1445 -0.0483 -0.0093 0.0345 0.4577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.89e-01 1.01e-01 -2.85 0.00445 **
## YEAR 2.04e-04 5.16e-05 3.96 8e-05 ***
## MONTH02 4.94e-03 8.91e-03 0.55 0.57946
## MONTH03 1.89e-02 8.91e-03 2.12 0.03403 *
## MONTH04 1.80e-02 8.91e-03 2.02 0.04376 *
## MONTH05 1.43e-02 8.91e-03 1.60 0.10953
## MONTH06 1.51e-02 8.91e-03 1.70 0.08917 .
## MONTH07 2.81e-02 8.91e-03 3.15 0.00164 **
## MONTH08 3.19e-02 8.91e-03 3.58 0.00035 ***
## MONTH09 1.98e-02 8.91e-03 2.22 0.02660 *
## MONTH10 1.08e-02 8.91e-03 1.21 0.22510
## MONTH11 6.16e-03 8.91e-03 0.69 0.48926
## MONTH12 1.09e-02 8.91e-03 1.23 0.21948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0696 on 1451 degrees of freedom
## Multiple R-squared: 0.0264, Adjusted R-squared: 0.0184
## F-statistic: 3.28 on 12 and 1451 DF, p-value: 0.000104
coef_prcp_yr <- c(coefficients(lm_prcp_yr), summary(lm_prcp_yr)$r.square)
coef_prcp_yr
## (Intercept) YEAR
## -100.8708 0.0750 0.0909
#coef_prcp_day <- coefficients(lm_tmax_day)
#coef_table_prcp <- cbind(coef_prcp_day, coef_prcp_yr)
#coef_table_prcp
The table above shows the slope and intercept of the TMAX linear model based on two different approaches.`
yr_prcp_scatter_line <- yearly_prcp_scatter+
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F) +
labs(title = "Total Yearly Precipitation")
yr_prcp_scatter_line
lm_snow_yr <- lm(SNOW_TOT ~ YEAR, data = yearly_weather_100yr)
summary(lm_snow_yr)
##
## Call:
## lm(formula = SNOW_TOT ~ YEAR, data = yearly_weather_100yr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.36 -11.88 -1.58 7.99 33.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.5785 70.5341 -0.21 0.84
## YEAR 0.0211 0.0360 0.59 0.56
##
## Residual standard error: 14 on 120 degrees of freedom
## Multiple R-squared: 0.00286, Adjusted R-squared: -0.00545
## F-statistic: 0.344 on 1 and 120 DF, p-value: 0.559
coef_snow_yr <- coefficients(lm_snow_yr)
coef_snow_yr
## (Intercept) YEAR
## -14.5785 0.0211
The table above shows the slope and intercept of the TMAX linear
model based on two different approaches.coef_tmax_day uses
the raw daily TMAX for each year in the dataset while
coef_tmax_yr uses the yearly tmax averages.
yr_snow_scatter_line <- yearly_snow_scatter+
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F)+
labs(title = "Total Yearly Snowfall")
yr_snow_scatter_line
grid.arrange(yr_prcp_scatter_line, yr_snow_scatter_line,
yr_tmax_scatter_line, yr_tmin_scatter_line,
nrow = 2,
top = "Weather Patterns in Central Park, NYC, from 1900-2021")
Do the observed changes in the weather data from New York City align with the documented rise in global air temperatures over the last century?
To compare the slope of TMAX from the NYC data set to the known global temp change, will look at confidence interval of the coefficients from the linear model.
weather1880 <- subset(monthly_weather, monthly_weather$YEAR > 1879 & monthly_weather$YEAR < 2022)
yearly_1880 <- subset(yearly_weather, yearly_weather$YEAR > 1879 & yearly_weather$YEAR < 2022)
yr1880_scatter <- ggplot(yearly_1880, aes(x = YEAR, y = TMAX)) +
geom_point() +
labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum Temp")
yr1880_scatter
yr1880_line <- yr1880_scatter +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
col = "black") +
labs(title = "NYC vs Global Temperature Change since 1880")
yr1880_line
yr1880_global <- yr1880_line +
geom_abline(aes(intercept = 33.2, slope = 0.014, col = "red")) +
scale_color_discrete(name = "Locale", labels = c("Global"))
yr1880_global
lm1880 <- lm(TMAX ~ YEAR + MONTH, data = weather1880)
lm_yr1880 <- lm(TMAX ~ YEAR, data = yearly_1880)
summary(lm1880)
##
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = weather1880)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.470 -2.162 0.021 2.151 12.182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.56997 3.82828 -5.37 8.8e-08 ***
## YEAR 0.03024 0.00196 15.45 < 2e-16 ***
## MONTH02 1.36947 0.39310 3.48 0.00051 ***
## MONTH03 9.67015 0.39310 24.60 < 2e-16 ***
## MONTH04 21.29680 0.39310 54.18 < 2e-16 ***
## MONTH05 32.36938 0.39310 82.34 < 2e-16 ***
## MONTH06 41.12661 0.39310 104.62 < 2e-16 ***
## MONTH07 46.02703 0.39310 117.09 < 2e-16 ***
## MONTH08 44.14562 0.39310 112.30 < 2e-16 ***
## MONTH09 37.49351 0.39310 95.38 < 2e-16 ***
## MONTH10 26.25897 0.39310 66.80 < 2e-16 ***
## MONTH11 14.47426 0.39310 36.82 < 2e-16 ***
## MONTH12 3.79827 0.39310 9.66 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.31 on 1691 degrees of freedom
## Multiple R-squared: 0.961, Adjusted R-squared: 0.961
## F-statistic: 3.51e+03 on 12 and 1691 DF, p-value: <2e-16
summary(lm_yr1880)
##
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_1880)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.168 -0.774 0.061 0.750 3.324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9014 5.0632 0.57 0.57
## YEAR 0.0301 0.0026 11.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.27 on 140 degrees of freedom
## Multiple R-squared: 0.491, Adjusted R-squared: 0.487
## F-statistic: 135 on 1 and 140 DF, p-value: <2e-16
weather1981 <- subset(monthly_weather, monthly_weather$YEAR > 1980 & monthly_weather$YEAR < 2022)
yearly_1981 <- subset(yearly_weather, yearly_weather$YEAR > 1980 & yearly_weather$YEAR < 2022)
all_1981 <- subset(weather_data, weather_data$YEAR > 1980 & weather_data$YEAR < 2022)
yr1981_scatter <- ggplot(yearly_1981, aes(x = YEAR, y = TMAX)) +
geom_point() +
labs(x = "Year", y = "Temperature (F)", title = "Yearly Average Maximum Temp")
yr1981_scatter
yr1981_line <- yr1981_scatter +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
col = "black") +
labs(title = "NYC vs Global Temperature Change since 1981")
yr1981_line
yr1981_global <- yr1981_line +
geom_abline(aes(intercept = -0.6, slope = 0.032, col = "red")) +
scale_color_discrete(name = "Locale", labels = c("Global"))
yr1981_global
lm1981 <- lm(TMAX ~ YEAR + MONTH, data = weather1981)
lm_yearly1981 <- lm(TMAX ~ YEAR, data = yearly_1981)
lm_all1981 <- lm(TMAX ~ YEAR + MONTH, data = all_1981)
summary(lm1981)
##
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = weather1981)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.673 -1.923 -0.037 2.035 11.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.1657 24.4478 0.74 0.46
## YEAR 0.0106 0.0122 0.86 0.39
## MONTH02 3.2210 0.7081 4.55 6.8e-06 ***
## MONTH03 11.1857 0.7081 15.80 < 2e-16 ***
## MONTH04 22.6684 0.7081 32.01 < 2e-16 ***
## MONTH05 32.3674 0.7081 45.71 < 2e-16 ***
## MONTH06 40.7887 0.7081 57.61 < 2e-16 ***
## MONTH07 45.8442 0.7081 64.75 < 2e-16 ***
## MONTH08 44.1841 0.7081 62.40 < 2e-16 ***
## MONTH09 37.0709 0.7081 52.36 < 2e-16 ***
## MONTH10 25.6436 0.7081 36.22 < 2e-16 ***
## MONTH11 15.1497 0.7081 21.40 < 2e-16 ***
## MONTH12 5.1487 0.7081 7.27 1.5e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.21 on 479 degrees of freedom
## Multiple R-squared: 0.962, Adjusted R-squared: 0.961
## F-statistic: 1e+03 on 12 and 479 DF, p-value: <2e-16
summary(lm_yearly1981)
##
## Call:
## lm(formula = TMAX ~ YEAR, data = yearly_1981)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.761 -0.615 0.189 0.885 2.746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.5153 33.2338 1.25 0.22
## YEAR 0.0107 0.0166 0.65 0.52
##
## Residual standard error: 1.26 on 39 degrees of freedom
## Multiple R-squared: 0.0106, Adjusted R-squared: -0.0148
## F-statistic: 0.418 on 1 and 39 DF, p-value: 0.522
summary(lm_all1981)
##
## Call:
## lm(formula = TMAX ~ YEAR + MONTH, data = all_1981)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.76 -5.88 -0.23 5.63 35.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.53858 12.17530 1.44 0.150
## YEAR 0.01087 0.00608 1.79 0.074 .
## MONTH02 3.23265 0.35782 9.03 <2e-16 ***
## MONTH03 11.18568 0.34940 32.01 <2e-16 ***
## MONTH04 22.66842 0.35230 64.34 <2e-16 ***
## MONTH05 32.36743 0.34940 92.64 <2e-16 ***
## MONTH06 40.78875 0.35230 115.78 <2e-16 ***
## MONTH07 45.84422 0.34940 131.21 <2e-16 ***
## MONTH08 44.18411 0.34940 126.46 <2e-16 ***
## MONTH09 37.07086 0.35230 105.22 <2e-16 ***
## MONTH10 25.64359 0.34940 73.39 <2e-16 ***
## MONTH11 15.14972 0.35230 43.00 <2e-16 ***
## MONTH12 5.14870 0.34940 14.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.81 on 14962 degrees of freedom
## Multiple R-squared: 0.764, Adjusted R-squared: 0.764
## F-statistic: 4.04e+03 on 12 and 14962 DF, p-value: <2e-16
Are there any notable (and statistically significant) changes in weather patterns after/during the pandemic lockdown (perhaps due to changes in commuting patterns)?
covid_weather <- subset(monthly_weather, YEAR > 2010 & YEAR < 2021 &
(MONTH == "04" |MONTH == "05" | MONTH == "06" |MONTH == "07" |
MONTH == "08"))
covid_weather <- covid_weather %>%
add_column(Lockdown =
ifelse(.$YEAR < 2020, "Pre-Lockdown", "Post-Lockdown"),
.after = "YEAR")
covid_weather <- covid_weather %>%
add_column(Season =
ifelse(.$MONTH == "04" | .$MONTH == "05", "Spring", "Summer"),
.after = "YEAR")
covid_seasons <- aggregate(TMAX ~ Season + Lockdown, covid_weather, mean)
covid_seasons$Lockdown <- factor(covid_seasons$Lockdown, levels = c('Pre-Lockdown', 'Post-Lockdown'))
covid_bar2 <- ggplot(covid_seasons, aes(x = Season, y = TMAX, fill = Lockdown, decreasing = TRUE)) +
geom_col(position = "dodge") +
labs(title = "Seasonal Average Daily Rainfall Pre and Post COVID-19 Lockdown")
covid_bar2
covid_spring_summer <- subset(weather_data,
YEAR > 2010 &
YEAR <= 2020 &
(MONTH == "04" |MONTH == "05" | MONTH == "06" |
MONTH == "07" | MONTH == "08"))
covid_spring_summer_ld <- covid_spring_summer %>%
add_column(Lockdown =
ifelse(.$YEAR < 2020, "Pre-Lockdown", "Post-Lockdown"),
.after = "YEAR")
covid_spring_summer_ld$Lockdown <- factor(covid_spring_summer_ld$Lockdown, levels = c('Pre-Lockdown', 'Post-Lockdown'))
covid_ld_season <- covid_spring_summer_ld %>%
add_column(Season =
ifelse(.$MONTH == "04" | .$MONTH == "05", "Spring", "Summer"),
.after = "YEAR")
covid_boxplot <- ggplot()+
geom_boxplot(data = covid_ld_season, aes(y = TMAX, x = Season, fill = Lockdown)) +
labs(title = "Temperature Ranges Pre- and Post- COVID Lockdown")
covid_boxplot
pre_spring <- subset(weather_data,
YEAR > 2010 & YEAR < 2020 &
(MONTH == "04" |MONTH == "05"))
pre_summer <- subset(weather_data,
YEAR > 2010 & YEAR < 2020 &
(MONTH == "06" |MONTH == "07" |
MONTH == "08"))
covid_lockdown_spring <- subset(weather_data,
YEAR == 2020 &
(MONTH == "04" |MONTH == "05"))
covid_lockdown_summer <- subset(weather_data,
YEAR == 2020 &
(MONTH == "06" |MONTH == "07" |
MONTH == "08"))
cov_spring_ttest <- t.test(pre_spring$TMAX, covid_lockdown_spring$TMAX)
cov_spring_ttest
##
## Welch Two Sample t-test
##
## data: pre_spring$TMAX and covid_lockdown_spring$TMAX
## t = 3, df = 78, p-value = 0.005
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.15 6.33
## sample estimates:
## mean of x mean of y
## 67.3 63.5
cov_summer_ttest <- t.test(pre_summer$TMAX, covid_lockdown_summer$TMAX)
cov_summer_ttest
##
## Welch Two Sample t-test
##
## data: pre_summer$TMAX and covid_lockdown_summer$TMAX
## t = -2, df = 119, p-value = 0.05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.4951 0.0265
## sample estimates:
## mean of x mean of y
## 83.1 84.4
#loading packages and data
NYweath <- data.frame(read.csv("data/NYC_weather_1869_2022.csv"))
xkablesummary(NYweath)
| STATION | NAME | LATITUDE | LONGITUDE | ELEVATION | DATE | ACMH | ACSH | AWND | DAEV | DASF | DAWM | EVAP | FMTM | MDEV | MDSF | MDWM | PGTM | PRCP | PSUN | SNOW | SNWD | TAVG | TMAX | TMIN | TOBS | TSUN | WDF1 | WDF2 | WDF5 | WDFG | WDFM | WDMV | WESD | WSF1 | WSF2 | WSF5 | WSFG | WSFM | WT01 | WT02 | WT03 | WT04 | WT05 | WT06 | WT07 | WT08 | WT09 | WT11 | WT13 | WT14 | WT15 | WT16 | WT17 | WT18 | WT19 | WT21 | WT22 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:56093 | Length:56093 | Min. :40.8 | Min. :-74 | Min. :42.7 | Length:56093 | Min. : 0 | Min. : 0 | Min. : 0 | Min. :2 | Min. :2 | Min. :2 | Min. :0 | Min. : 0 | Min. :0 | Min. : 0 | Min. : 7 | Min. : 0 | Min. :0.00 | Min. : 0 | Min. : 0.0 | Min. : 0 | Min. : 0 | Min. : 2.0 | Min. :-15.0 | Min. :60 | Min. : 0 | Min. : 10 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 23 | Min. : 0 | Min. :0 | Min. : 1 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 6 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 | Min. :1 |
| Q1 | Class :character | Class :character | 1st Qu.:40.8 | 1st Qu.:-74 | 1st Qu.:42.7 | Class :character | 1st Qu.: 30 | 1st Qu.: 20 | 1st Qu.: 4 | 1st Qu.:2 | 1st Qu.:2 | 1st Qu.:3 | 1st Qu.:0 | 1st Qu.: 1045 | 1st Qu.:0 | 1st Qu.: 2 | 1st Qu.: 57 | 1st Qu.: 952 | 1st Qu.:0.00 | 1st Qu.: 18 | 1st Qu.: 0.0 | 1st Qu.: 0 | 1st Qu.:43 | 1st Qu.: 46.0 | 1st Qu.: 34.0 | 1st Qu.:60 | 1st Qu.: 48 | 1st Qu.:110 | 1st Qu.: 80 | 1st Qu.:100 | 1st Qu.:135 | 1st Qu.:135 | 1st Qu.: 16 | 1st Qu.:0 | 1st Qu.:12 | 1st Qu.:12 | 1st Qu.:17 | 1st Qu.:17 | 1st Qu.:13 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 | 1st Qu.:1 |
| Median | Mode :character | Mode :character | Median :40.8 | Median :-74 | Median :42.7 | Mode :character | Median : 50 | Median : 50 | Median : 6 | Median :3 | Median :2 | Median :3 | Median :0 | Median : 1450 | Median :0 | Median : 4 | Median : 84 | Median :1440 | Median :0.00 | Median : 63 | Median : 0.0 | Median : 0 | Median :57 | Median : 63.0 | Median : 47.0 | Median :61 | Median : 420 | Median :230 | Median :240 | Median :230 | Median :225 | Median :225 | Median : 26 | Median :0 | Median :14 | Median :14 | Median :22 | Median :21 | Median :16 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 | Median :1 |
| Mean | NA | NA | Mean :40.8 | Mean :-74 | Mean :42.7 | NA | Mean : 52 | Mean : 52 | Mean : 6 | Mean :3 | Mean :2 | Mean :3 | Mean :0 | Mean : 1445 | Mean :0 | Mean : 5 | Mean : 99 | Mean :1364 | Mean :0.12 | Mean : 56 | Mean : 0.1 | Mean : 0 | Mean :56 | Mean : 61.5 | Mean : 46.8 | Mean :61 | Mean : 395 | Mean :196 | Mean :199 | Mean :202 | Mean :209 | Mean :212 | Mean : 31 | Mean :0 | Mean :14 | Mean :15 | Mean :23 | Mean :22 | Mean :17 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 | Mean :1 |
| Q3 | NA | NA | 3rd Qu.:40.8 | 3rd Qu.:-74 | 3rd Qu.:42.7 | NA | 3rd Qu.: 80 | 3rd Qu.: 80 | 3rd Qu.: 8 | 3rd Qu.:3 | 3rd Qu.:2 | 3rd Qu.:3 | 3rd Qu.:0 | 3rd Qu.: 1850 | 3rd Qu.:1 | 3rd Qu.: 7 | 3rd Qu.:129 | 3rd Qu.:1834 | 3rd Qu.:0.05 | 3rd Qu.: 92 | 3rd Qu.: 0.0 | 3rd Qu.: 0 | 3rd Qu.:71 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | 3rd Qu.:62 | 3rd Qu.: 641 | 3rd Qu.:280 | 3rd Qu.:290 | 3rd Qu.:290 | 3rd Qu.:270 | 3rd Qu.:315 | 3rd Qu.: 40 | 3rd Qu.:0 | 3rd Qu.:16 | 3rd Qu.:17 | 3rd Qu.:26 | 3rd Qu.:25 | 3rd Qu.:20 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 | 3rd Qu.:1 |
| Max | NA | NA | Max. :40.8 | Max. :-74 | Max. :42.7 | NA | Max. :100 | Max. :100 | Max. :23 | Max. :4 | Max. :2 | Max. :4 | Max. :7 | Max. :32767 | Max. :1 | Max. :40 | Max. :294 | Max. :2359 | Max. :8.28 | Max. :100 | Max. :27.3 | Max. :26 | Max. :92 | Max. :106.0 | Max. : 87.0 | Max. :62 | Max. :2352 | Max. :760 | Max. :360 | Max. :360 | Max. :360 | Max. :360 | Max. :178 | Max. :1 | Max. :51 | Max. :40 | Max. :62 | Max. :64 | Max. :52 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 | Max. :1 |
| NA | NA | NA | NA | NA | NA | NA | NA’s :55362 | NA’s :55363 | NA’s :46312 | NA’s :55964 | NA’s :56037 | NA’s :55971 | NA’s :54028 | NA’s :46050 | NA’s :55964 | NA’s :56037 | NA’s :55971 | NA’s :47015 | NA | NA’s :51300 | NA’s :163 | NA’s :16499 | NA’s :53445 | NA’s :7 | NA’s :7 | NA’s :56091 | NA’s :46878 | NA’s :51751 | NA’s :46562 | NA’s :46614 | NA’s :51735 | NA’s :49933 | NA’s :53956 | NA’s :56081 | NA’s :51747 | NA’s :46562 | NA’s :46576 | NA’s :51710 | NA’s :49933 | NA’s :52361 | NA’s :55746 | NA’s :55363 | NA’s :55943 | NA’s :55766 | NA’s :55950 | NA’s :56001 | NA’s :54239 | NA’s :56072 | NA’s :56088 | NA’s :53769 | NA’s :55770 | NA’s :56073 | NA’s :49545 | NA’s :56026 | NA’s :54775 | NA’s :55914 | NA’s :56092 | NA’s :56008 |
nrow(NYweath)
[1] 56093
#converting to R date format and adding columns for day, month, and year
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format="%d")
NYweath$month <- format(NYweath$DATE, format="%m")
NYweath$year <- format(NYweath$DATE, format="%Y")
#converting temperature observations to numerics
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)
#Making month a factor
NYweath$month <- as.factor(NYweath$month)
#creating a subset of the weather data that are only date- and temperature-related
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW))
xkablesummary(NYweath_sub)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :1869-02-28 | Length:56093 | 03 : 4774 | Min. :1869 | Min. : 2.0 | Min. :-15.0 | Min. : 0 | Min. :0.00 | Min. : 0.0 |
| Q1 | 1st Qu.:1907-07-23 | Class :character | 05 : 4774 | 1st Qu.:1907 | 1st Qu.: 46.0 | 1st Qu.: 34.0 | 1st Qu.:43 | 1st Qu.:0.00 | 1st Qu.: 0.0 |
| Median | Median :1945-12-13 | Mode :character | 07 : 4774 | Median :1945 | Median : 63.0 | Median : 47.0 | Median :57 | Median :0.00 | Median : 0.0 |
| Mean | Mean :1945-12-13 | NA | 08 : 4774 | Mean :1945 | Mean : 61.5 | Mean : 46.8 | Mean :56 | Mean :0.12 | Mean : 0.1 |
| Q3 | 3rd Qu.:1984-05-05 | NA | 01 : 4743 | 3rd Qu.:1984 | 3rd Qu.: 78.0 | 3rd Qu.: 62.0 | 3rd Qu.:71 | 3rd Qu.:0.05 | 3rd Qu.: 0.0 |
| Max | Max. :2022-09-26 | NA | 10 : 4743 | Max. :2022 | Max. :106.0 | Max. : 87.0 | Max. :92 | Max. :8.28 | Max. :27.3 |
| NA | NA | NA | (Other):27511 | NA | NA’s :7 | NA’s :7 | NA’s :53445 | NA | NA’s :163 |
Let’s try to plot the max temp recorded each day in Central Park over time.
NYweath_sub %>%
ggplot(aes(x = DATE, y = TMAX)) +
geom_point() +
ggtitle("Minimum Daily Temperature in Central Park")
And the minimum.
NYweath_sub %>%
ggplot(aes(x = DATE, y = TMIN)) +
geom_point() +
ggtitle("Minimum Daily Temperature in Central Park")
What a mess! Wait, of course this will look like a mess, because temperature oscillates with the seasons! Let’s make sure this oscillation is present in the data by looking at just a 2-year window of minimum daily temperatures from 2018 to the present.
Recent_weath <- subset(NYweath_sub, year > 2017)
Recent_weath %>%
ggplot(aes(x = DATE, y = TMIN)) +
geom_point() +
ggtitle("Minimum Daily Temperature in Central Park")
How reassuring! Now, let’s look at the maximum daily temperatures.
Recent_weath <- subset(NYweath_sub, year > 2017)
Recent_weath %>%
ggplot(aes(x = DATE, y = TMAX)) +
geom_point() +
ggtitle("Maximum Daily Temperature in Central Park")
Cool. We could fit these data to sine curves to determine the amplitudes or center points of oscillation along with confidence intervals, and compare them to the results for way earlier in the 20th or 19th centuries, to see if there are statistical differences in the curve fit parameters across the 100-year window.
What about trends in daily temperature range (daily high minus daily low)?
Recent_weath$rang <- (Recent_weath$TMAX - Recent_weath$TMIN)
Recent_weath %>%
ggplot(aes(x = DATE, y = rang)) +
geom_point() +
labs(
title = "Daily Temperature Range in Central Park",
x = "Date",
y = "Daily high minus low temp")
Interesting… I do see a trend here too: the greatest daily variation in temperature in Central Park appears to have some periodicity. We could compute the average daily temperature variation for each month, perhaps over the 100 year interval, to see if some months have statistically higher daily temperature variations.
maxTfit0 <- lm(formula = TMAX ~ year, data = NYweath_sub )
summary(maxTfit0)
##
## Call:
## lm(formula = TMAX ~ year, data = NYweath_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.64 -15.42 0.92 16.27 44.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.70280 3.46245 0.2 0.84
## year 0.03127 0.00178 17.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.7 on 56084 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.00548, Adjusted R-squared: 0.00546
## F-statistic: 309 on 1 and 56084 DF, p-value: <2e-16
Trying a fit on TMAX for all data using month as a factor variable.
maxTfit1 <- lm(formula = TMAX ~ year + month, data = NYweath_sub )
summary(maxTfit1)
##
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.08 -5.87 -0.14 5.65 38.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.33e+01 1.65e+00 -14.08 < 2e-16 ***
## year 3.16e-02 8.47e-04 37.36 < 2e-16 ***
## month02 1.38e+00 1.87e-01 7.37 1.8e-13 ***
## month03 9.55e+00 1.82e-01 52.37 < 2e-16 ***
## month04 2.12e+01 1.84e-01 115.13 < 2e-16 ***
## month05 3.23e+01 1.82e-01 177.34 < 2e-16 ***
## month06 4.11e+01 1.84e-01 223.89 < 2e-16 ***
## month07 4.61e+01 1.82e-01 252.71 < 2e-16 ***
## month08 4.41e+01 1.82e-01 242.21 < 2e-16 ***
## month09 3.74e+01 1.84e-01 203.33 < 2e-16 ***
## month10 2.62e+01 1.83e-01 143.63 < 2e-16 ***
## month11 1.43e+01 1.84e-01 77.52 < 2e-16 ***
## month12 3.71e+00 1.83e-01 20.29 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.89 on 56073 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.775, Adjusted R-squared: 0.775
## F-statistic: 1.61e+04 on 12 and 56073 DF, p-value: <2e-16
The multiple r-squared (and adjusted r-squared) fits are much better here, at 0.775. This may simply be because there are many more data points than when using monthly averages. So– that full mess of data was all useful!
Here are the equations:
January: -23.3 + 0.0316 year
February: -23.3+ 1.38 + 0.0316 year
March: -23.3 + 9.55 + 0.0316 year
April: -23.3 + 21.2 + 0.0316 year
May: -23.3 + 32.3 + 0.0316 year
June: -23.3 + 41.1 + 0.0316 year
July: -23.3 + 46.1 + 0.0316 year
August: -23.3 + 44.1 + 0.0316 year
September: -23.3 + 37.4 + 0.0316 year
October: -23.3 + 26.2 + 0.0316 year
November: -23.3 + 14.1 + 0.0316 year
December: -23.3 + 3.71 + 0.0316 year
On average, maximum daily temperatures in this location in central park have been increasing by 0.0316 degrees Fahrenheit per year over the time frame of the data set (1869 to 2022).
maxTfit2 <- lm(formula = TMAX ~ year*month, data = NYweath_sub )
summary(maxTfit2)
##
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.88 -5.86 -0.13 5.64 38.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.99525 5.67942 0.53 0.59793
## year 0.01814 0.00292 6.22 5.1e-10 ***
## month02 -48.49021 8.22446 -5.90 3.7e-09 ***
## month03 -60.34742 7.99196 -7.55 4.4e-14 ***
## month04 -37.07059 8.05762 -4.60 4.2e-06 ***
## month05 19.85288 8.00082 2.48 0.01309 *
## month06 47.84350 8.05762 5.94 2.9e-09 ***
## month07 39.66717 7.99196 4.96 6.9e-07 ***
## month08 22.68921 7.99196 2.84 0.00453 **
## month09 26.84293 8.06269 3.33 0.00087 ***
## month10 16.26197 8.02986 2.03 0.04285 *
## month11 -31.06543 8.09646 -3.84 0.00012 ***
## month12 -36.39448 8.02986 -4.53 5.8e-06 ***
## year:month02 0.02563 0.00423 6.06 1.3e-09 ***
## year:month03 0.03592 0.00411 8.75 < 2e-16 ***
## year:month04 0.02993 0.00414 7.23 4.9e-13 ***
## year:month05 0.00641 0.00411 1.56 0.11870
## year:month06 -0.00344 0.00414 -0.83 0.40539
## year:month07 0.00328 0.00411 0.80 0.42388
## year:month08 0.01103 0.00411 2.69 0.00725 **
## year:month09 0.00541 0.00414 1.31 0.19154
## year:month10 0.00511 0.00413 1.24 0.21515
## year:month11 0.02330 0.00416 5.60 2.1e-08 ***
## year:month12 0.02061 0.00413 4.99 5.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.87 on 56062 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.776, Adjusted R-squared: 0.775
## F-statistic: 8.42e+03 on 23 and 56062 DF, p-value: <2e-16
The equations, including interaction terms, are:
January: -2.998 + 0.01814 year
February: -48.49 + -2.998 (0.02563 + 0.01814) year
March: -60.35 + -2.998 + (0.03592 + 0.01814) year
April: -37.07 + -2.998 + (0.02993 + 0.01814) year
May: 19.85 + -2.998 + (0.00641 + 0.01814) year
June: 47.84 + -2.998 + (0.02563 + 0.01814) year
July: 39.37 + -2.998 + (0.0328 + 0.01814) year
August: 22.69 + -2.998 + (0.01103 + 0.01814) year
September: 26.84 + -2.998 + (0.00541 + 0.01814) year
October: 16.26 + -2.998 + (0.00511 + 0.01814) year
November: -31.07 + -2.998 + (0.02330 + 0.01814) year
December: -36.39 + -2.998 + (0.02061 + 0.01814) year
Essentially the same r-squared here. The interaction terms are significant for February, March, April, August, November, and December, meaning that the change in temperature over time is likely different for these months. The warming is greatest in February, March, April, November, and December.
Now, let’s subset to 1900 to 2022.
NYweath_00 <- subset (NYweath_sub, year > 1899) # subsetting to 20th century and later
xkabledplyhead(NYweath_00)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| 11265 | 1900-01-01 | 01 | 01 | 1900 | 20 | 14 | NA | 0.03 | 0.5 |
| 11266 | 1900-01-02 | 02 | 01 | 1900 | 23 | 13 | NA | 0.00 | 0.0 |
| 11267 | 1900-01-03 | 03 | 01 | 1900 | 26 | 19 | NA | 0.00 | 0.0 |
| 11268 | 1900-01-04 | 04 | 01 | 1900 | 32 | 19 | NA | 0.00 | 0.0 |
| 11269 | 1900-01-05 | 05 | 01 | 1900 | 39 | 29 | NA | 0.00 | 0.0 |
maxTfit00_0 <- lm(formula = TMAX ~ year, data = NYweath_00 )
summary(maxTfit00_0)
##
## Call:
## lm(formula = TMAX ~ year, data = NYweath_00)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.95 -15.34 1.04 16.22 44.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.80092 4.86335 2.22 0.026 *
## year 0.02616 0.00248 10.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.6 on 44827 degrees of freedom
## Multiple R-squared: 0.00248, Adjusted R-squared: 0.00245
## F-statistic: 111 on 1 and 44827 DF, p-value: <2e-16
maxTfit00_1 <- lm(formula = TMAX ~ year + month, data = NYweath_00 )
summary(maxTfit00_1)
##
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_00)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.37 -5.87 -0.17 5.62 37.48
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.05508 2.32357 -4.76 2.0e-06 ***
## year 0.02539 0.00118 21.47 < 2e-16 ***
## month02 1.41719 0.20808 6.81 9.8e-12 ***
## month03 10.18568 0.20318 50.13 < 2e-16 ***
## month04 21.47157 0.20487 104.81 < 2e-16 ***
## month05 32.31209 0.20318 159.03 < 2e-16 ***
## month06 40.86425 0.20487 199.47 < 2e-16 ***
## month07 46.03803 0.20318 226.59 < 2e-16 ***
## month08 44.18752 0.20318 217.48 < 2e-16 ***
## month09 37.47104 0.20492 182.86 < 2e-16 ***
## month10 26.49418 0.20360 130.13 < 2e-16 ***
## month11 14.62180 0.20529 71.22 < 2e-16 ***
## month12 3.74193 0.20360 18.38 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.87 on 44816 degrees of freedom
## Multiple R-squared: 0.773, Adjusted R-squared: 0.773
## F-statistic: 1.27e+04 on 12 and 44816 DF, p-value: <2e-16
maxTfit00_2 <- lm(formula = TMAX ~ year * month, data = NYweath_00 )
summary(maxTfit00_2)
##
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_00)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.62 -5.86 -0.18 5.60 37.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.97638 7.92089 3.03 0.00247 **
## year 0.00753 0.00404 1.86 0.06229 .
## month02 -91.34608 11.47314 -7.96 1.7e-15 ***
## month03 -49.69861 11.20184 -4.44 9.2e-06 ***
## month04 -54.37296 11.29480 -4.81 1.5e-06 ***
## month05 6.04397 11.20184 0.54 0.58951
## month06 22.67663 11.29480 2.01 0.04468 *
## month07 31.67848 11.20184 2.83 0.00469 **
## month08 8.40307 11.20184 0.75 0.45317
## month09 31.76679 11.30382 2.81 0.00495 **
## month10 40.24554 11.26959 3.57 0.00036 ***
## month11 -26.86867 11.36423 -2.36 0.01807 *
## month12 -64.98949 11.26959 -5.77 8.1e-09 ***
## year:month02 0.04730 0.00585 8.09 6.3e-16 ***
## year:month03 0.03054 0.00571 5.35 9.0e-08 ***
## year:month04 0.03868 0.00576 6.72 1.9e-11 ***
## year:month05 0.01340 0.00571 2.35 0.01901 *
## year:month06 0.00927 0.00576 1.61 0.10729
## year:month07 0.00732 0.00571 1.28 0.19981
## year:month08 0.01825 0.00571 3.20 0.00140 **
## year:month09 0.00291 0.00576 0.50 0.61383
## year:month10 -0.00702 0.00575 -1.22 0.22195
## year:month11 0.02116 0.00579 3.65 0.00026 ***
## year:month12 0.03505 0.00575 6.10 1.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.85 on 44805 degrees of freedom
## Multiple R-squared: 0.774, Adjusted R-squared: 0.774
## F-statistic: 6.68e+03 on 23 and 44805 DF, p-value: <2e-16
ggplot(NYweath_00, aes(x = DATE, y = TMAX, color = month)) +
geom_point(alpha = 0.6) +
scale_color_manual(values = c("01" = "purple4",
"02" = "purple",
"03" = "dodgerblue",
"04" = "cyan4",
"05" = "yellow",
"06" = "darkgoldenrod1",
"07" = "red",
"08" = "orange",
"09" = "yellow",
"10" = "green",
"11" = "turquoise",
"12" = "steelblue")) +
labs(
x = "Year",
y = "Maximum Daily Temperature",
title = "Maximum Daily Temperature in Central Park") +
xlab(label = "Year") +
ylab(label = "Maximum daily temperature") +
ggtitle(label = "Maximum Daily Temperature in Central Park") +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F)
## Plotting the same data, but removing all but January and July
ggplot(NYweath_00, aes(x = year, y = TMAX, color = month)) +
geom_point() +
scale_color_manual(values = c("01" = "purple4",
"07" = "red"), na.value = NA) +
geom_abline(aes(intercept = -11.05508, slope = 0.02539), col = "black", size = 1) +
geom_abline(aes(intercept = 34.98295, slope = 0.02539), col = "black", size = 1) +
labs(
x = "Year",
y = "Maximum Daily Temperature",
title = "Maximum Daily Temperature in Central Park") +
xlab(label = "Year") +
ylab(label = "Maximum daily temperature") +
ggtitle(label = "Maximum Daily Temperature in Central Park")
# stat_smooth(method = "lm",
# formula = y ~ x,
# geom = "smooth",
# color = "black",
# show.legend = F)
ggplot(NYweath_00, aes(x = DATE, y = TMAX, color = month)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c("01" = "gray35",
"02" = "gray35",
"03" = "gray35",
"04" = "gray35",
"05" = "gray35",
"06" = "gray35",
"07" = "gray35",
"08" = "gray35",
"09" = "gray35",
"10" = "gray35",
"11" = "gray35",
"12" = "gray35")) +
labs(
x = "Year",
y = "Maximum Daily Temperature",
title = "Maximum Daily Temperature in Central Park") +
xlab(label = "Year") +
ylab(label = "Maximum daily temperature") +
ggtitle(label = "Maximum Daily Temperature in Central Park") +
stat_smooth(method = "lm",
formula = y ~ x,
geom = "smooth",
color = "black",
show.legend = F)
NYweath_48 <- subset (NYweath_sub, year > 1947) # subsetting to 1948 and later for comparison with JFK data
xkabledplyhead(NYweath_48)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| 28796 | 1948-01-01 | 01 | 01 | 1948 | 32 | 29 | NA | 0.71 | 0.1 |
| 28797 | 1948-01-02 | 02 | 01 | 1948 | 34 | 30 | NA | 0.87 | 0.4 |
| 28798 | 1948-01-03 | 03 | 01 | 1948 | 34 | 26 | NA | 0.00 | 0.0 |
| 28799 | 1948-01-04 | 04 | 01 | 1948 | 35 | 25 | NA | 0.02 | 0.1 |
| 28800 | 1948-01-05 | 05 | 01 | 1948 | 37 | 32 | NA | 0.00 | 0.0 |
maxTfit48_0 <- lm(formula = TMAX ~ year, data = NYweath_48 )
summary(maxTfit48_0)
##
## Call:
## lm(formula = TMAX ~ year, data = NYweath_48)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.73 -15.14 1.02 16.07 41.40
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.59725 10.28369 2.98 0.0029 **
## year 0.01619 0.00518 3.12 0.0018 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.5 on 27296 degrees of freedom
## Multiple R-squared: 0.000358, Adjusted R-squared: 0.000321
## F-statistic: 9.76 on 1 and 27296 DF, p-value: 0.00178
maxTfit48_1 <- lm(formula = TMAX ~ year + month, data = NYweath_48 )
summary(maxTfit48_1)
##
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_48)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.44 -5.89 -0.16 5.59 36.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.92079 4.92933 2.22 0.027 *
## year 0.01406 0.00248 5.67 1.5e-08 ***
## month02 2.69999 0.26567 10.16 < 2e-16 ***
## month03 10.87570 0.25944 41.92 < 2e-16 ***
## month04 22.65256 0.26159 86.60 < 2e-16 ***
## month05 32.68774 0.25944 126.00 < 2e-16 ***
## month06 41.24278 0.26159 157.66 < 2e-16 ***
## month07 46.40688 0.25944 178.88 < 2e-16 ***
## month08 44.77290 0.25944 172.58 < 2e-16 ***
## month09 37.54036 0.26171 143.44 < 2e-16 ***
## month10 26.49689 0.26031 101.79 < 2e-16 ***
## month11 15.31521 0.26249 58.35 < 2e-16 ***
## month12 4.62418 0.26031 17.76 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.85 on 27285 degrees of freedom
## Multiple R-squared: 0.771, Adjusted R-squared: 0.771
## F-statistic: 7.64e+03 on 12 and 27285 DF, p-value: <2e-16
maxTfit48_2 <- lm(formula = TMAX ~ year*month, data = NYweath_48 )
summary(maxTfit48_2)
##
## Call:
## lm(formula = TMAX ~ year * month, data = NYweath_48)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.41 -5.90 -0.18 5.61 35.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.45328 16.80071 0.38 0.70090
## year 0.01631 0.00846 1.93 0.05396 .
## month02 -34.24256 24.32884 -1.41 0.15929
## month03 -53.57555 23.75979 -2.25 0.02415 *
## month04 4.05564 23.95697 0.17 0.86557
## month05 43.95305 23.75979 1.85 0.06434 .
## month06 88.80876 23.95697 3.71 0.00021 ***
## month07 87.58973 23.75979 3.69 0.00023 ***
## month08 64.73312 23.75979 2.72 0.00644 **
## month09 52.56168 23.98838 2.19 0.02845 *
## month10 112.18612 23.99956 4.67 3e-06 ***
## month11 37.04712 24.20267 1.53 0.12585
## month12 -65.33324 23.99956 -2.72 0.00649 **
## year:month02 0.01861 0.01226 1.52 0.12888
## year:month03 0.03247 0.01197 2.71 0.00668 **
## year:month04 0.00937 0.01207 0.78 0.43757
## year:month05 -0.00568 0.01197 -0.47 0.63539
## year:month06 -0.02396 0.01207 -1.99 0.04709 *
## year:month07 -0.02075 0.01197 -1.73 0.08304 .
## year:month08 -0.01006 0.01197 -0.84 0.40084
## year:month09 -0.00757 0.01208 -0.63 0.53117
## year:month10 -0.04318 0.01209 -3.57 0.00036 ***
## year:month11 -0.01095 0.01219 -0.90 0.36918
## year:month12 0.03525 0.01209 2.92 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.83 on 27274 degrees of freedom
## Multiple R-squared: 0.771, Adjusted R-squared: 0.771
## F-statistic: 4e+03 on 23 and 27274 DF, p-value: <2e-16
I wonder if we’ll get something different if we look at weather in a more built environment. Starting a new RMD file for this (JFK airport).
JFKweath <- data.frame(read.csv("data/JFK_weather_1949-2022.csv"))
xkablesummary(JFKweath)
| STATION | NAME | LATITUDE | LONGITUDE | ELEVATION | DATE | ACMH | ACMH_ATTRIBUTES | ACSH | ACSH_ATTRIBUTES | AWND | AWND_ATTRIBUTES | DAPR | DAPR_ATTRIBUTES | EVAP | EVAP_ATTRIBUTES | FMTM | FMTM_ATTRIBUTES | MDPR | MDPR_ATTRIBUTES | PGTM | PGTM_ATTRIBUTES | PRCP | PRCP_ATTRIBUTES | SNOW | SNOW_ATTRIBUTES | SNWD | SNWD_ATTRIBUTES | TAVG | TAVG_ATTRIBUTES | TMAX | TMAX_ATTRIBUTES | TMIN | TMIN_ATTRIBUTES | TOBS | TOBS_ATTRIBUTES | TSUN | TSUN_ATTRIBUTES | WDF1 | WDF1_ATTRIBUTES | WDF2 | WDF2_ATTRIBUTES | WDF5 | WDF5_ATTRIBUTES | WDFG | WDFG_ATTRIBUTES | WDFM | WDFM_ATTRIBUTES | WDMV | WDMV_ATTRIBUTES | WESD | WESD_ATTRIBUTES | WSF1 | WSF1_ATTRIBUTES | WSF2 | WSF2_ATTRIBUTES | WSF5 | WSF5_ATTRIBUTES | WSFG | WSFG_ATTRIBUTES | WSFM | WSFM_ATTRIBUTES | WT01 | WT01_ATTRIBUTES | WT02 | WT02_ATTRIBUTES | WT03 | WT03_ATTRIBUTES | WT04 | WT04_ATTRIBUTES | WT05 | WT05_ATTRIBUTES | WT06 | WT06_ATTRIBUTES | WT07 | WT07_ATTRIBUTES | WT08 | WT08_ATTRIBUTES | WT09 | WT09_ATTRIBUTES | WT11 | WT11_ATTRIBUTES | WT13 | WT13_ATTRIBUTES | WT14 | WT14_ATTRIBUTES | WT15 | WT15_ATTRIBUTES | WT16 | WT16_ATTRIBUTES | WT17 | WT17_ATTRIBUTES | WT18 | WT18_ATTRIBUTES | WT21 | WT21_ATTRIBUTES | WT22 | WT22_ATTRIBUTES | WV01 | WV01_ATTRIBUTES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:27104 | Length:27104 | Min. :40.6 | Min. :-73.8 | Min. :2.7 | Length:27104 | Min. : 0 | Length:27104 | Min. : 0 | Length:27104 | Min. : 0 | Length:27104 | Min. :16 | Length:27104 | Mode:logical | Mode:logical | Min. : 0 | Length:27104 | Min. : 0 | Length:27104 | Min. : 0 | Length:27104 | Min. :0 | Length:27104 | Min. : 0 | Length:27104 | Min. : 0 | Length:27104 | Min. : 8 | Length:27104 | Min. : 8.0 | Length:27104 | Min. :-2.0 | Length:27104 | Mode:logical | Mode:logical | Min. : 0 | Length:27104 | Min. : 10 | Length:27104 | Min. : 10 | Length:27104 | Min. : 0 | Length:27104 | Min. : 23 | Length:27104 | Min. :360 | Length:27104 | Mode:logical | Mode:logical | Min. :0 | Length:27104 | Min. : 6 | Length:27104 | Min. : 7 | Length:27104 | Min. : 0 | Length:27104 | Min. : 8 | Length:27104 | Min. : 9 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 | Min. :1 | Length:27104 |
| Q1 | Class :character | Class :character | 1st Qu.:40.6 | 1st Qu.:-73.8 | 1st Qu.:2.7 | Class :character | 1st Qu.: 30 | Class :character | 1st Qu.: 30 | Class :character | 1st Qu.: 8 | Class :character | 1st Qu.:27 | Class :character | NA’s:27104 | NA’s:27104 | 1st Qu.: 1102 | Class :character | 1st Qu.: 2 | Class :character | 1st Qu.:1023 | Class :character | 1st Qu.:0 | Class :character | 1st Qu.: 0 | Class :character | 1st Qu.: 0 | Class :character | 1st Qu.:42 | Class :character | 1st Qu.: 48.0 | Class :character | 1st Qu.:34.0 | Class :character | NA’s:27104 | NA’s:27104 | 1st Qu.: 0 | Class :character | 1st Qu.:160 | Class :character | 1st Qu.:160 | Class :character | 1st Qu.:160 | Class :character | 1st Qu.:180 | Class :character | 1st Qu.:360 | Class :character | NA’s:27104 | NA’s:27104 | 1st Qu.:0 | Class :character | 1st Qu.:15 | Class :character | 1st Qu.: 17 | Class :character | 1st Qu.: 21 | Class :character | 1st Qu.:20 | Class :character | 1st Qu.:12 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character | 1st Qu.:1 | Class :character |
| Median | Mode :character | Mode :character | Median :40.6 | Median :-73.8 | Median :2.7 | Mode :character | Median : 60 | Mode :character | Median : 70 | Mode :character | Median : 11 | Mode :character | Median :28 | Mode :character | NA | NA | Median : 1526 | Mode :character | Median : 3 | Mode :character | Median :1458 | Mode :character | Median :0 | Mode :character | Median : 0 | Mode :character | Median : 0 | Mode :character | Median :56 | Mode :character | Median : 62.0 | Mode :character | Median :47.0 | Mode :character | NA | NA | Median : 0 | Mode :character | Median :200 | Mode :character | Median :210 | Mode :character | Median :210 | Mode :character | Median :225 | Mode :character | Median :360 | Mode :character | NA | NA | Median :0 | Mode :character | Median :17 | Mode :character | Median : 21 | Mode :character | Median : 26 | Mode :character | Median :24 | Mode :character | Median :15 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character | Median :1 | Mode :character |
| Mean | NA | NA | Mean :40.6 | Mean :-73.8 | Mean :2.7 | NA | Mean : 58 | NA | Mean : 61 | NA | Mean : 11 | NA | Mean :28 | NA | NA | NA | Mean : 1439 | NA | Mean : 4 | NA | Mean :1384 | NA | Mean :0 | NA | Mean : 0 | NA | Mean : 0 | NA | Mean :56 | NA | Mean : 61.5 | NA | Mean :47.2 | NA | NA | NA | Mean : 160 | NA | Mean :212 | NA | Mean :215 | NA | Mean :215 | NA | Mean :231 | NA | Mean :360 | NA | NA | NA | Mean :0 | NA | Mean :19 | NA | Mean : 22 | NA | Mean : 27 | NA | Mean :26 | NA | Mean :15 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA | Mean :1 | NA |
| Q3 | NA | NA | 3rd Qu.:40.6 | 3rd Qu.:-73.8 | 3rd Qu.:2.7 | NA | 3rd Qu.: 80 | NA | 3rd Qu.: 90 | NA | 3rd Qu.: 13 | NA | 3rd Qu.:30 | NA | NA | NA | 3rd Qu.: 1850 | NA | 3rd Qu.: 4 | NA | 3rd Qu.:1837 | NA | 3rd Qu.:0 | NA | 3rd Qu.: 0 | NA | 3rd Qu.: 0 | NA | 3rd Qu.:71 | NA | 3rd Qu.: 77.0 | NA | 3rd Qu.:62.0 | NA | NA | NA | 3rd Qu.: 0 | NA | 3rd Qu.:300 | NA | 3rd Qu.:310 | NA | 3rd Qu.:310 | NA | 3rd Qu.:315 | NA | 3rd Qu.:360 | NA | NA | NA | 3rd Qu.:0 | NA | 3rd Qu.:22 | NA | 3rd Qu.: 26 | NA | 3rd Qu.: 32 | NA | 3rd Qu.:30 | NA | 3rd Qu.:18 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA | 3rd Qu.:1 | NA |
| Max | NA | NA | Max. :40.6 | Max. :-73.8 | Max. :2.7 | NA | Max. :100 | NA | Max. :100 | NA | Max. :308 | NA | Max. :31 | NA | NA | NA | Max. :32767 | NA | Max. :17 | NA | Max. :2359 | NA | Max. :8 | NA | Max. :30 | NA | Max. :28 | NA | Max. :91 | NA | Max. :104.0 | NA | Max. :82.0 | NA | NA | NA | Max. :2832 | NA | Max. :360 | NA | Max. :360 | NA | Max. :360 | NA | Max. :360 | NA | Max. :360 | NA | NA | NA | Max. :6 | NA | Max. :52 | NA | Max. :314 | NA | Max. :214 | NA | Max. :71 | NA | Max. :21 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA | Max. :1 | NA |
| NA | NA | NA | NA | NA | NA | NA | NA’s :15663 | NA | NA’s :15662 | NA | NA’s :12953 | NA | NA’s :27008 | NA | NA | NA | NA’s :16980 | NA | NA’s :27008 | NA | NA’s :14837 | NA | NA’s :2890 | NA | NA’s :3172 | NA | NA’s :3172 | NA | NA’s :20961 | NA | NA’s :2 | NA | NA’s :2 | NA | NA | NA | NA’s :27073 | NA | NA’s :15668 | NA | NA’s :17455 | NA | NA’s :17538 | NA | NA’s :21983 | NA | NA’s :27103 | NA | NA | NA | NA’s :21033 | NA | NA’s :15665 | NA | NA’s :17455 | NA | NA’s :17539 | NA | NA’s :19641 | NA | NA’s :27102 | NA | NA’s :19021 | NA | NA’s :25713 | NA | NA’s :25673 | NA | NA’s :26713 | NA | NA’s :26730 | NA | NA’s :26896 | NA | NA’s :26992 | NA | NA’s :21556 | NA | NA’s :26924 | NA | NA’s :27090 | NA | NA’s :24919 | NA | NA’s :26217 | NA | NA’s :27062 | NA | NA’s :19315 | NA | NA’s :27044 | NA | NA’s :25677 | NA | NA’s :27099 | NA | NA’s :27055 | NA | NA’s :27103 | NA |
#converting to R date format and adding columns for day, month, and year
JFKweath$DATE <- as.Date(JFKweath$DATE)
JFKweath$day <- format(JFKweath$DATE, format="%d")
JFKweath$month <- format(JFKweath$DATE, format="%m")
JFKweath$year <- format(JFKweath$DATE, format="%Y")
#converting temperature observations to numerics
JFKweath$TMAX <- as.numeric(JFKweath$TMAX)
JFKweath$TMIN <- as.numeric(JFKweath$TMIN)
JFKweath$TAVG <- as.numeric(JFKweath$TAVG)
JFKweath$year <- as.numeric(JFKweath$year)
#Making month a factor
NYweath$month <- as.factor(NYweath$month)
#creating a subset of the weather data
JFKweath_sub <- subset(JFKweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW))
xkablesummary(JFKweath_sub)
| DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :1948-07-17 | Length:27104 | Length:27104 | Min. :1948 | Min. : 8.0 | Min. :-2.0 | Min. : 8 | Min. :0 | Min. : 0 |
| Q1 | 1st Qu.:1967-02-03 | Class :character | Class :character | 1st Qu.:1967 | 1st Qu.: 48.0 | 1st Qu.:34.0 | 1st Qu.:42 | 1st Qu.:0 | 1st Qu.: 0 |
| Median | Median :1985-08-23 | Mode :character | Mode :character | Median :1985 | Median : 62.0 | Median :47.0 | Median :56 | Median :0 | Median : 0 |
| Mean | Mean :1985-08-23 | NA | NA | Mean :1985 | Mean : 61.5 | Mean :47.2 | Mean :56 | Mean :0 | Mean : 0 |
| Q3 | 3rd Qu.:2004-03-12 | NA | NA | 3rd Qu.:2004 | 3rd Qu.: 77.0 | 3rd Qu.:62.0 | 3rd Qu.:71 | 3rd Qu.:0 | 3rd Qu.: 0 |
| Max | Max. :2022-09-30 | NA | NA | Max. :2022 | Max. :104.0 | Max. :82.0 | Max. :91 | Max. :8 | Max. :30 |
| NA | NA | NA | NA | NA | NA’s :2 | NA’s :2 | NA’s :20961 | NA’s :2890 | NA’s :3172 |
jmaxTfit1 <- lm(formula = TMAX ~ year + month, data = JFKweath_sub )
summary(jmaxTfit1)
##
## Call:
## lm(formula = TMAX ~ year + month, data = JFKweath_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.57 -5.17 -0.15 5.04 36.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -28.21756 4.45988 -6.33 2.5e-10 ***
## year 0.03380 0.00224 15.06 < 2e-16 ***
## month02 2.02149 0.23936 8.45 < 2e-16 ***
## month03 9.32084 0.23372 39.88 < 2e-16 ***
## month04 19.76311 0.23566 83.86 < 2e-16 ***
## month05 29.41606 0.23375 125.84 < 2e-16 ***
## month06 38.83293 0.23566 164.78 < 2e-16 ***
## month07 44.45024 0.23334 190.49 < 2e-16 ***
## month08 43.24112 0.23295 185.63 < 2e-16 ***
## month09 36.60503 0.23487 155.85 < 2e-16 ***
## month10 26.04513 0.23373 111.43 < 2e-16 ***
## month11 15.05592 0.23567 63.88 < 2e-16 ***
## month12 4.75950 0.23376 20.36 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.92 on 27089 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.795, Adjusted R-squared: 0.795
## F-statistic: 8.78e+03 on 12 and 27089 DF, p-value: <2e-16